In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import radians, sin, cos, sqrt, atan2
import warnings
import folium 

warnings.filterwarnings("ignore")

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer

from sklearn.model_selection import train_test_split
In [2]:
df = pd.read_csv("C:\\Users\\TransOrg\\Downloads\\finalTrain.csv")
df.head()
Out[2]:
ID Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Order_Date Time_Orderd Time_Order_picked Weather_conditions Road_traffic_density Vehicle_condition Type_of_order Type_of_vehicle multiple_deliveries Festival City Time_taken (min)
0 0xcdcd DEHRES17DEL01 36.0 4.2 30.327968 78.046106 30.397968 78.116106 12-02-2022 21:55 22:10 Fog Jam 2 Snack motorcycle 3.0 No Metropolitian 46
1 0xd987 KOCRES16DEL01 21.0 4.7 10.003064 76.307589 10.043064 76.347589 13-02-2022 14:55 15:05 Stormy High 1 Meal motorcycle 1.0 No Metropolitian 23
2 0x2784 PUNERES13DEL03 23.0 4.7 18.562450 73.916619 18.652450 74.006619 04-03-2022 17:30 17:40 Sandstorms Medium 1 Drinks scooter 1.0 No Metropolitian 21
3 0xc8b6 LUDHRES15DEL02 34.0 4.3 30.899584 75.809346 30.919584 75.829346 13-02-2022 09:20 09:30 Sandstorms Low 0 Buffet motorcycle 0.0 No Metropolitian 20
4 0xdb64 KNPRES14DEL02 24.0 4.7 26.463504 80.372929 26.593504 80.502929 14-02-2022 19:50 20:05 Fog Jam 1 Snack scooter 1.0 No Metropolitian 41
In [3]:
df.shape
Out[3]:
(45584, 20)
In [4]:
df.describe()
Out[4]:
Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Vehicle_condition multiple_deliveries Time_taken (min)
count 43730.000000 43676.000000 45584.000000 45584.000000 45584.000000 45584.000000 45584.000000 44591.000000 45584.000000
mean 29.566911 4.633774 17.017948 70.229684 17.465480 70.844161 1.023385 0.744635 26.293963
std 5.815064 0.334744 8.185674 22.885575 7.335562 21.120578 0.839055 0.572510 9.384298
min 15.000000 1.000000 -30.905562 -88.366217 0.010000 0.010000 0.000000 0.000000 10.000000
25% 25.000000 4.500000 12.933284 73.170000 12.988453 73.280000 0.000000 0.000000 19.000000
50% 30.000000 4.700000 18.551440 75.897963 18.633934 76.002574 1.000000 1.000000 26.000000
75% 35.000000 4.900000 22.728163 78.044095 22.785049 78.107044 2.000000 1.000000 32.000000
max 50.000000 6.000000 30.914057 88.433452 31.054057 88.563452 3.000000 3.000000 54.000000
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45584 non-null  object 
 1   Delivery_person_ID           45584 non-null  object 
 2   Delivery_person_Age          43730 non-null  float64
 3   Delivery_person_Ratings      43676 non-null  float64
 4   Restaurant_latitude          45584 non-null  float64
 5   Restaurant_longitude         45584 non-null  float64
 6   Delivery_location_latitude   45584 non-null  float64
 7   Delivery_location_longitude  45584 non-null  float64
 8   Order_Date                   45584 non-null  object 
 9   Time_Orderd                  43853 non-null  object 
 10  Time_Order_picked            45584 non-null  object 
 11  Weather_conditions           44968 non-null  object 
 12  Road_traffic_density         44983 non-null  object 
 13  Vehicle_condition            45584 non-null  int64  
 14  Type_of_order                45584 non-null  object 
 15  Type_of_vehicle              45584 non-null  object 
 16  multiple_deliveries          44591 non-null  float64
 17  Festival                     45356 non-null  object 
 18  City                         44384 non-null  object 
 19  Time_taken (min)             45584 non-null  int64  
dtypes: float64(7), int64(2), object(11)
memory usage: 7.0+ MB
In [6]:
df.isnull().sum()
Out[6]:
ID                                0
Delivery_person_ID                0
Delivery_person_Age            1854
Delivery_person_Ratings        1908
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Order_Date                        0
Time_Orderd                    1731
Time_Order_picked                 0
Weather_conditions              616
Road_traffic_density            601
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries             993
Festival                        228
City                           1200
Time_taken (min)                  0
dtype: int64
In [7]:
for i in df.columns:
    print(f" no. of unique value in {i}  is   : {df[i].nunique()}")
    print("#######################################################")
    if (df[i].nunique()) < 8:
        print(f"           unique value are : {df[i].unique()}")
        print("***************************************************")
    else:
        pass
 no. of unique value in ID  is   : 45584
#######################################################
 no. of unique value in Delivery_person_ID  is   : 1320
#######################################################
 no. of unique value in Delivery_person_Age  is   : 22
#######################################################
 no. of unique value in Delivery_person_Ratings  is   : 28
#######################################################
 no. of unique value in Restaurant_latitude  is   : 657
#######################################################
 no. of unique value in Restaurant_longitude  is   : 518
#######################################################
 no. of unique value in Delivery_location_latitude  is   : 4373
#######################################################
 no. of unique value in Delivery_location_longitude  is   : 4373
#######################################################
 no. of unique value in Order_Date  is   : 44
#######################################################
 no. of unique value in Time_Orderd  is   : 176
#######################################################
 no. of unique value in Time_Order_picked  is   : 193
#######################################################
 no. of unique value in Weather_conditions  is   : 6
#######################################################
           unique value are : ['Fog' 'Stormy' 'Sandstorms' 'Windy' 'Cloudy' 'Sunny' nan]
***************************************************
 no. of unique value in Road_traffic_density  is   : 4
#######################################################
           unique value are : ['Jam' 'High' 'Medium' 'Low' nan]
***************************************************
 no. of unique value in Vehicle_condition  is   : 4
#######################################################
           unique value are : [2 1 0 3]
***************************************************
 no. of unique value in Type_of_order  is   : 4
#######################################################
           unique value are : ['Snack' 'Meal' 'Drinks' 'Buffet']
***************************************************
 no. of unique value in Type_of_vehicle  is   : 4
#######################################################
           unique value are : ['motorcycle' 'scooter' 'electric_scooter' 'bicycle']
***************************************************
 no. of unique value in multiple_deliveries  is   : 4
#######################################################
           unique value are : [ 3.  1.  0. nan  2.]
***************************************************
 no. of unique value in Festival  is   : 2
#######################################################
           unique value are : ['No' 'Yes' nan]
***************************************************
 no. of unique value in City  is   : 3
#######################################################
           unique value are : ['Metropolitian' 'Urban' 'Semi-Urban' nan]
***************************************************
 no. of unique value in Time_taken (min)  is   : 45
#######################################################
In [8]:
# lets create some features with the help of our existing features

feature = []
dtypes = []
count = []
unique = []
missing = []
missing_percentage=[]

for column in df.columns:
    feature.append(column)
    count.append(len(df[column]))
    missing.append(df[column].isnull().sum())
    unique.append(df[column].nunique())
    missing_percentage.append((df[column].isnull().sum() / df.shape[0]) * 100)
    dtypes.append(df[column].dtypes)
    
dataframe=pd.DataFrame({'feature':feature,
             'count':count,
             'missing':missing,
             'nunique':unique,
              'missing_percentage':missing_percentage,
              'dtypes':dtypes
              
             }) 
dataframe.set_index('feature')
Out[8]:
count missing nunique missing_percentage dtypes
feature
ID 45584 0 45584 0.000000 object
Delivery_person_ID 45584 0 1320 0.000000 object
Delivery_person_Age 45584 1854 22 4.067217 float64
Delivery_person_Ratings 45584 1908 28 4.185679 float64
Restaurant_latitude 45584 0 657 0.000000 float64
Restaurant_longitude 45584 0 518 0.000000 float64
Delivery_location_latitude 45584 0 4373 0.000000 float64
Delivery_location_longitude 45584 0 4373 0.000000 float64
Order_Date 45584 0 44 0.000000 object
Time_Orderd 45584 1731 176 3.797385 object
Time_Order_picked 45584 0 193 0.000000 object
Weather_conditions 45584 616 6 1.351351 object
Road_traffic_density 45584 601 4 1.318445 object
Vehicle_condition 45584 0 4 0.000000 int64
Type_of_order 45584 0 4 0.000000 object
Type_of_vehicle 45584 0 4 0.000000 object
multiple_deliveries 45584 993 4 2.178396 float64
Festival 45584 228 2 0.500176 object
City 45584 1200 3 2.632503 object
Time_taken (min) 45584 0 45 0.000000 int64
In [9]:
df.drop('ID', axis=1, inplace=True)
In [10]:
df
Out[10]:
Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Order_Date Time_Orderd Time_Order_picked Weather_conditions Road_traffic_density Vehicle_condition Type_of_order Type_of_vehicle multiple_deliveries Festival City Time_taken (min)
0 DEHRES17DEL01 36.0 4.2 30.327968 78.046106 30.397968 78.116106 12-02-2022 21:55 22:10 Fog Jam 2 Snack motorcycle 3.0 No Metropolitian 46
1 KOCRES16DEL01 21.0 4.7 10.003064 76.307589 10.043064 76.347589 13-02-2022 14:55 15:05 Stormy High 1 Meal motorcycle 1.0 No Metropolitian 23
2 PUNERES13DEL03 23.0 4.7 18.562450 73.916619 18.652450 74.006619 04-03-2022 17:30 17:40 Sandstorms Medium 1 Drinks scooter 1.0 No Metropolitian 21
3 LUDHRES15DEL02 34.0 4.3 30.899584 75.809346 30.919584 75.829346 13-02-2022 09:20 09:30 Sandstorms Low 0 Buffet motorcycle 0.0 No Metropolitian 20
4 KNPRES14DEL02 24.0 4.7 26.463504 80.372929 26.593504 80.502929 14-02-2022 19:50 20:05 Fog Jam 1 Snack scooter 1.0 No Metropolitian 41
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45579 JAPRES04DEL01 30.0 4.8 26.902328 75.794257 26.912328 75.804257 24-03-2022 11:35 11:45 Windy High 1 Meal motorcycle 0.0 No Metropolitian 32
45580 AGRRES16DEL01 21.0 4.6 0.000000 0.000000 0.070000 0.070000 16-02-2022 19:55 20:10 Windy Jam 0 Buffet motorcycle 1.0 No Metropolitian 36
45581 CHENRES08DEL03 30.0 4.9 13.022394 80.242439 13.052394 80.272439 11-03-2022 23:50 24:05:00 Cloudy Low 1 Drinks scooter 0.0 No Metropolitian 16
45582 COIMBRES11DEL01 20.0 4.7 11.001753 76.986241 11.041753 77.026241 07-03-2022 13:35 13:40 Cloudy High 0 Snack motorcycle 1.0 No Metropolitian 26
45583 RANCHIRES09DEL02 23.0 4.9 23.351058 85.325731 23.431058 85.405731 02-03-2022 17:10 17:15 Fog Medium 2 Snack scooter 1.0 No Metropolitian 36

45584 rows × 19 columns

In [11]:
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format='%d-%m-%Y')
df.head()
Out[11]:
Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Order_Date Time_Orderd Time_Order_picked Weather_conditions Road_traffic_density Vehicle_condition Type_of_order Type_of_vehicle multiple_deliveries Festival City Time_taken (min)
0 DEHRES17DEL01 36.0 4.2 30.327968 78.046106 30.397968 78.116106 2022-02-12 21:55 22:10 Fog Jam 2 Snack motorcycle 3.0 No Metropolitian 46
1 KOCRES16DEL01 21.0 4.7 10.003064 76.307589 10.043064 76.347589 2022-02-13 14:55 15:05 Stormy High 1 Meal motorcycle 1.0 No Metropolitian 23
2 PUNERES13DEL03 23.0 4.7 18.562450 73.916619 18.652450 74.006619 2022-03-04 17:30 17:40 Sandstorms Medium 1 Drinks scooter 1.0 No Metropolitian 21
3 LUDHRES15DEL02 34.0 4.3 30.899584 75.809346 30.919584 75.829346 2022-02-13 09:20 09:30 Sandstorms Low 0 Buffet motorcycle 0.0 No Metropolitian 20
4 KNPRES14DEL02 24.0 4.7 26.463504 80.372929 26.593504 80.502929 2022-02-14 19:50 20:05 Fog Jam 1 Snack scooter 1.0 No Metropolitian 41
In [12]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Delivery_person_ID           45584 non-null  object        
 1   Delivery_person_Age          43730 non-null  float64       
 2   Delivery_person_Ratings      43676 non-null  float64       
 3   Restaurant_latitude          45584 non-null  float64       
 4   Restaurant_longitude         45584 non-null  float64       
 5   Delivery_location_latitude   45584 non-null  float64       
 6   Delivery_location_longitude  45584 non-null  float64       
 7   Order_Date                   45584 non-null  datetime64[ns]
 8   Time_Orderd                  43853 non-null  object        
 9   Time_Order_picked            45584 non-null  object        
 10  Weather_conditions           44968 non-null  object        
 11  Road_traffic_density         44983 non-null  object        
 12  Vehicle_condition            45584 non-null  int64         
 13  Type_of_order                45584 non-null  object        
 14  Type_of_vehicle              45584 non-null  object        
 15  multiple_deliveries          44591 non-null  float64       
 16  Festival                     45356 non-null  object        
 17  City                         44384 non-null  object        
 18  Time_taken (min)             45584 non-null  int64         
dtypes: datetime64[ns](1), float64(7), int64(2), object(9)
memory usage: 6.6+ MB
In [13]:
df['year']= df['Order_Date'].dt.year
df['month']= df['Order_Date'].dt.month
df['day']= df['Order_Date'].dt.day
In [14]:
df.head()
Out[14]:
Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Order_Date Time_Orderd Time_Order_picked ... Vehicle_condition Type_of_order Type_of_vehicle multiple_deliveries Festival City Time_taken (min) year month day
0 DEHRES17DEL01 36.0 4.2 30.327968 78.046106 30.397968 78.116106 2022-02-12 21:55 22:10 ... 2 Snack motorcycle 3.0 No Metropolitian 46 2022 2 12
1 KOCRES16DEL01 21.0 4.7 10.003064 76.307589 10.043064 76.347589 2022-02-13 14:55 15:05 ... 1 Meal motorcycle 1.0 No Metropolitian 23 2022 2 13
2 PUNERES13DEL03 23.0 4.7 18.562450 73.916619 18.652450 74.006619 2022-03-04 17:30 17:40 ... 1 Drinks scooter 1.0 No Metropolitian 21 2022 3 4
3 LUDHRES15DEL02 34.0 4.3 30.899584 75.809346 30.919584 75.829346 2022-02-13 09:20 09:30 ... 0 Buffet motorcycle 0.0 No Metropolitian 20 2022 2 13
4 KNPRES14DEL02 24.0 4.7 26.463504 80.372929 26.593504 80.502929 2022-02-14 19:50 20:05 ... 1 Snack scooter 1.0 No Metropolitian 41 2022 2 14

5 rows × 22 columns

In [15]:
df.drop('Order_Date',axis=1,inplace=True)
In [16]:
df.dropna(subset=['Time_Orderd'],inplace=True)
In [17]:
df['Time_Orderd'] = df['Time_Orderd'].str.replace('.', ':')
In [18]:
df['Time_Orderd'].sample(10)
Out[18]:
10882    21:25
1257     18:15
32630    10:50
27965    17:55
11534    14:50
40590    23:30
43193    21:50
5624     10:30
15060    17:30
38883    14:30
Name: Time_Orderd, dtype: object
In [19]:
# Define a lambda function with error handling

def extract_time(x):
    try:
        return x.split(':')[0] + ':' + x.split(':')[1][:2]
    except IndexError:
        return '00:00'

# Apply the lambda function to the Order_Time column
df['Time_Orderd'] = df['Time_Orderd'].apply(extract_time)
In [20]:
# Convert the Order_Time column to a time data type
df['Time_Orderd'] = pd.to_datetime(df['Time_Orderd'], format='%H:%M:%S',errors='ignore')
In [21]:
df['TimeOrder_Hour']=df['Time_Orderd'].str.split(':',expand=True)[0] # 12:00 -> 12, 00


# replacing '0' with '00'  - both indicate 24th hr 
df['TimeOrder_Hour']=df['TimeOrder_Hour'].replace('0','00') #12:00

# changing the datatype
df['TimeOrder_Hour']=df['TimeOrder_Hour'].astype('int32')
In [22]:
# order_minute
df['TimeOrder_min']=df['Time_Orderd'].str.split(':',expand=True)[1]

# changing the datatype
df['TimeOrder_min']=df['TimeOrder_min'].astype('int32')
In [23]:
df.head()
Out[23]:
Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Time_Orderd Time_Order_picked Weather_conditions ... Type_of_vehicle multiple_deliveries Festival City Time_taken (min) year month day TimeOrder_Hour TimeOrder_min
0 DEHRES17DEL01 36.0 4.2 30.327968 78.046106 30.397968 78.116106 21:55 22:10 Fog ... motorcycle 3.0 No Metropolitian 46 2022 2 12 21 55
1 KOCRES16DEL01 21.0 4.7 10.003064 76.307589 10.043064 76.347589 14:55 15:05 Stormy ... motorcycle 1.0 No Metropolitian 23 2022 2 13 14 55
2 PUNERES13DEL03 23.0 4.7 18.562450 73.916619 18.652450 74.006619 17:30 17:40 Sandstorms ... scooter 1.0 No Metropolitian 21 2022 3 4 17 30
3 LUDHRES15DEL02 34.0 4.3 30.899584 75.809346 30.919584 75.829346 09:20 09:30 Sandstorms ... motorcycle 0.0 No Metropolitian 20 2022 2 13 9 20
4 KNPRES14DEL02 24.0 4.7 26.463504 80.372929 26.593504 80.502929 19:50 20:05 Fog ... scooter 1.0 No Metropolitian 41 2022 2 14 19 50

5 rows × 23 columns

In ouir TimeOrder_Hour columns¶

¶

7AM -12PM - breakfash##¶

12PM -15 PM - lunc##h¶

15PM-17 PM- after lun##ch¶

17-19 - evening sn##aks¶

19-24 - Dinner¶

In [24]:
df['Time_Order_picked'] = df['Time_Order_picked'].str.replace('.', ':')
In [25]:
# Define a lambda function with error handling
def extract_time(x):
    try:
        return x.split(':')[0] + ':' + x.split(':')[1][:2]
    except IndexError:
        return '00:00'

# Apply the lambda function to the Order_Time column
df['Time_Order_picked'] = df['Time_Order_picked'].apply(extract_time)
In [26]:
# Convert the Order_Time column to a time data type
df['Time_Order_picked'] = pd.to_datetime(df['Time_Order_picked'], format='%H:%M:%S',errors='ignore')
In [27]:
df['Time_Order_picked_Hour']=df['Time_Order_picked'].str.split(':',expand=True)[0]
# changing the datatype
df['Time_Order_picked_Hour']=df['Time_Order_picked_Hour'].astype('int32')
In [28]:
# order_minute
df['Time_Order_picked_min']=df['Time_Order_picked'].str.split(':',expand=True)[1]

# changing the datatype
df['Time_Order_picked_min']=df['Time_Order_picked_min'].astype('int32')

extracting city from delivery person ID

In [29]:
# spilliting the Delivery_person_id  on the bases of "RES", this will give us the city name of delivery

df['Delivery_city']=df['Delivery_person_ID'].str.split('RES',expand=True)[0]
In [30]:
df.head()
Out[30]:
Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Time_Orderd Time_Order_picked Weather_conditions ... City Time_taken (min) year month day TimeOrder_Hour TimeOrder_min Time_Order_picked_Hour Time_Order_picked_min Delivery_city
0 DEHRES17DEL01 36.0 4.2 30.327968 78.046106 30.397968 78.116106 21:55 22:10 Fog ... Metropolitian 46 2022 2 12 21 55 22 10 DEH
1 KOCRES16DEL01 21.0 4.7 10.003064 76.307589 10.043064 76.347589 14:55 15:05 Stormy ... Metropolitian 23 2022 2 13 14 55 15 5 KOC
2 PUNERES13DEL03 23.0 4.7 18.562450 73.916619 18.652450 74.006619 17:30 17:40 Sandstorms ... Metropolitian 21 2022 3 4 17 30 17 40 PUNE
3 LUDHRES15DEL02 34.0 4.3 30.899584 75.809346 30.919584 75.829346 09:20 09:30 Sandstorms ... Metropolitian 20 2022 2 13 9 20 9 30 LUDH
4 KNPRES14DEL02 24.0 4.7 26.463504 80.372929 26.593504 80.502929 19:50 20:05 Fog ... Metropolitian 41 2022 2 14 19 50 20 5 KNP

5 rows × 26 columns

In [31]:
df['Delivery_city'].unique()
Out[31]:
array(['DEH', 'KOC', 'PUNE', 'LUDH', 'KNP', 'MUM', 'MYS', 'HYD', 'KOL',
       'RANCHI', 'COIMB', 'CHEN', 'JAP', 'SUR', 'BANG', 'GOA', 'AURG',
       'AGR', 'VAD', 'ALH', 'BHP', 'INDO'], dtype=object)
In [32]:
df.columns
Out[32]:
Index(['Delivery_person_ID', 'Delivery_person_Age', 'Delivery_person_Ratings',
       'Restaurant_latitude', 'Restaurant_longitude',
       'Delivery_location_latitude', 'Delivery_location_longitude',
       'Time_Orderd', 'Time_Order_picked', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
       'Time_taken (min)', 'year', 'month', 'day', 'TimeOrder_Hour',
       'TimeOrder_min', 'Time_Order_picked_Hour', 'Time_Order_picked_min',
       'Delivery_city'],
      dtype='object')
In [33]:
df['Delivery_person_Ratings'].isnull().sum()
Out[33]:
268

filling the null value with the mean of that person id¶

In [34]:
rating_map = round(df.groupby('Delivery_person_ID')['Delivery_person_Ratings'].mean(),1).to_dict()

df['Delivery_person_Ratings']=df['Delivery_person_Ratings'].fillna(df['Delivery_person_ID'].map(rating_map))
df['Delivery_person_Ratings'].isnull().sum()
Out[34]:
0

working with MAP¶

In [35]:
folium.Map(tiles="cartodb positron")
Out[35]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [36]:
import plotly.express as px

# Make a copy of the DataFrame
data1 = df.copy()

# Drop rows with any missing values
data1.dropna(axis=0, how='any', subset=['Restaurant_latitude', 'Restaurant_longitude'], inplace=True)

# Define the color scale
color_scale = [(0, 'orange'), (1, 'red')]

# Create the scatter mapbox plot
fig = px.scatter_mapbox(
    data1, 
    lat="Restaurant_latitude", 
    lon="Restaurant_longitude", 
    color_continuous_scale=color_scale, 
    zoom=8, 
    height=1000, 
    width=1000
)

# Update the map layout to use open-street-map style
fig.update_layout(mapbox_style="open-street-map")

# Remove the margins
fig.update_layout(margin={"r":0, "t":0, "l":0, "b":0})

# Show the plot
fig.show()
In [37]:
# latitude of india is 8°4` N  to 37°6` North
# longitude of india is 68°7` E to 97°25` East

# this data set is realted to india. but in some of latitidude is given in negative(-), negative lattitude is related
# to indian ocean ..

# so that there is much chances that negative sign in lattitude is by mistake
In [38]:
# changing -ve to +ve dataset
"""
abs() function return the absolute value. The absolute
value of any number is always positive it removes the negative sign of a number in Python.
"""
df['Restaurant_latitude'] = df['Restaurant_latitude'].abs()

To calculate the distance between two points on a sphere (such as the Earth), we can use the Haversine formula¶

In [39]:
import math

def distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    R = 6371.0 # Earth's radius in km
    dist = R * c
    
    return dist
df['distance'] = df.apply(lambda row: distance(row['Restaurant_latitude'], row['Restaurant_longitude'], row['Delivery_location_latitude'], row['Delivery_location_longitude']), axis=1)
In [40]:
df['distance'].head()
Out[40]:
0    10.280582
1     6.242319
2    13.787860
3     2.930258
4    19.396618
Name: distance, dtype: float64
In [41]:
df.groupby(['Restaurant_latitude','Restaurant_longitude','Delivery_location_latitude','Delivery_location_longitude'])['distance'].max().head(40)
Out[41]:
Restaurant_latitude  Restaurant_longitude  Delivery_location_latitude  Delivery_location_longitude
0.000000             0.000000              0.010000                    0.010000                        1.572534
                                           0.020000                    0.020000                        3.145067
                                           0.030000                    0.030000                        4.717601
                                           0.040000                    0.040000                        6.290135
                                           0.050000                    0.050000                        7.862668
                                           0.060000                    0.060000                        9.435202
                                           0.070000                    0.070000                       11.007735
                                           0.080000                    0.080000                       12.580268
                                           0.090000                    0.090000                       14.152801
                                           0.110000                    0.110000                       17.297866
                                           0.130000                    0.130000                       20.442930
9.957144             76.296783             9.967144                    76.306783                       1.560724
                                           9.977144                    76.316783                       3.121424
                                           9.987144                    76.326783                       4.682100
                                           9.997144                    76.336783                       6.242753
                                           10.007144                   76.346783                       7.803382
                                           10.017144                   76.356783                       9.363988
                                           10.027144                   76.366783                      10.924570
                                           10.037144                   76.376783                      12.485128
                                           10.047144                   76.386783                      14.045662
                                           10.067144                   76.406783                      17.166660
                                           10.087144                   76.426783                      20.287562
9.959778             76.296106             9.969778                    76.306106                       1.560717
                                           9.979778                    76.316106                       3.121411
                                           9.989778                    76.326106                       4.682082
                                           9.999778                    76.336106                       6.242728
                                           10.009778                   76.346106                       7.803351
                                           10.019778                   76.356106                       9.363950
                                           10.029778                   76.366106                      10.924526
                                           10.039778                   76.376106                      12.485078
                                           10.049778                   76.386106                      14.045606
                                           10.069778                   76.406106                      17.166591
                                           10.089778                   76.426106                      20.287481
9.960846             76.293936             9.970846                    76.303936                       1.560715
                                           9.980846                    76.313936                       3.121406
                                           9.990846                    76.323936                       4.682074
                                           10.000846                   76.333936                       6.242718
                                           10.010846                   76.343936                       7.803339
                                           10.020846                   76.353936                       9.363935
                                           10.030846                   76.363936                      10.924508
Name: distance, dtype: float64
In [42]:
# droping those lattitude and longitude which are less than and above to
# latitude of india is 8°4` N  to 37°6` North
# longitude of india is 68°7` E to 97°25` East
In [43]:
df[(df['Restaurant_latitude'] < 8) & (df['Delivery_location_latitude'] < 8)].shape
Out[43]:
(3509, 27)
In [44]:
# droping  where  restaurant_lattitude and delivery_location_latitude less than 8

df = df.drop(df[(df['Restaurant_latitude'] < 8) & (df['Delivery_location_latitude'] < 8)].index)
In [45]:
df[(df['Restaurant_latitude'] > 37) & (df['Delivery_location_latitude'] > 37)]
Out[45]:
Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Time_Orderd Time_Order_picked Weather_conditions ... Time_taken (min) year month day TimeOrder_Hour TimeOrder_min Time_Order_picked_Hour Time_Order_picked_min Delivery_city distance

0 rows × 27 columns

In [46]:
df[(df['Restaurant_longitude'] < 68) & (df['Delivery_location_longitude'] < 68)]
Out[46]:
Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Time_Orderd Time_Order_picked Weather_conditions ... Time_taken (min) year month day TimeOrder_Hour TimeOrder_min Time_Order_picked_Hour Time_Order_picked_min Delivery_city distance

0 rows × 27 columns

In [47]:
df.columns
Out[47]:
Index(['Delivery_person_ID', 'Delivery_person_Age', 'Delivery_person_Ratings',
       'Restaurant_latitude', 'Restaurant_longitude',
       'Delivery_location_latitude', 'Delivery_location_longitude',
       'Time_Orderd', 'Time_Order_picked', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
       'Time_taken (min)', 'year', 'month', 'day', 'TimeOrder_Hour',
       'TimeOrder_min', 'Time_Order_picked_Hour', 'Time_Order_picked_min',
       'Delivery_city', 'distance'],
      dtype='object')
In [48]:
min_distance=df.groupby('distance')[['Restaurant_latitude','Restaurant_longitude','Delivery_location_latitude',
                        'Delivery_location_longitude']].min().sort_values(by='distance',ascending=True).head(1)
min_distance
Out[48]:
Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude
distance
1.465067 30.914057 75.83982 30.924057 75.84982
In [49]:
import folium
from branca.element import Figure
fig2=Figure(width=850,height=550)

# Create a map object centered on a specific location
m = folium.Map(location=[30.914057, 75.83982], zoom_start=12)

# Add a marker for the restaurant location
folium.Marker(location=[30.914057, 75.83982], popup="Restaurant",icon=folium.Icon(color='purple',prefix='fa',icon='anchor')).add_to(m)

# Add a marker for the delivery location
folium.Marker(location=[30.924057, 75.84982], popup="Delivery").add_to(m)

# Draw a line between the two locations
folium.PolyLine(locations=[[30.914057, 75.83982], [30.924057, 75.84982]], color='red').add_to(m)

# Display the map

fig2.add_child(m)
folium.TileLayer('Stamen Terrain').add_to(m)
folium.TileLayer('Stamen Toner').add_to(m)
folium.TileLayer('Stamen Water Color').add_to(m)
folium.TileLayer('cartodbpositron').add_to(m)
folium.TileLayer('cartodbdark_matter').add_to(m)
folium.LayerControl().add_to(m)
m 
Out[49]:
In [50]:
max_distance=df.groupby('distance')[['Restaurant_latitude','Restaurant_longitude','Delivery_location_latitude',
                        'Delivery_location_longitude']].max().sort_values(by='distance',ascending=False).head(1)
max_distance
Out[50]:
Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude
distance
20.969489 25.443994 81.860187 25.583994 82.000187
In [51]:
tooltips=f"Distance: {df['distance'].max()} km"
In [52]:
import folium
from branca.element import Figure
fig2=Figure(width=850,height=550)

# Create a map object centered on a specific location
m = folium.Map(location=[25.443994, 81.860187], zoom_start=12)

# Add a marker for the restaurant location
folium.Marker(location=[25.443994, 81.860187], popup="Restaurant",icon=folium.Icon(color='purple',prefix='fa',icon='anchor')).add_to(m)

# Add a marker for the delivery location
folium.Marker(location=[25.583994, 82.000187], popup="Delivery").add_to(m)

# Draw a line between the two locations
folium.PolyLine(locations=[[25.443994, 81.860187], [25.583994,82.000187]], color='red',tooltip=tooltips).add_to(m)

# Display the map

fig2.add_child(m)
folium.TileLayer('Stamen Terrain').add_to(m)
folium.TileLayer('Stamen Toner').add_to(m)
folium.TileLayer('Stamen Water Color').add_to(m)
folium.TileLayer('cartodbpositron').add_to(m)
folium.TileLayer('cartodbdark_matter').add_to(m)
folium.LayerControl().add_to(m)
m
Out[52]:
In [53]:
import plotly.express as px

# Copy the DataFrame
data1 = df.copy()

# Drop rows with any missing values
data1.dropna(axis=0, how='any', subset=['Restaurant_latitude', 'Restaurant_longitude'], inplace=True)

# Define the color scale
color_scale = [(0, 'orange'), (1, 'red')]

# Create the scatter mapbox plot
fig = px.scatter_mapbox(
    data1, 
    lat="Restaurant_latitude", 
    lon="Restaurant_longitude", 
    color_continuous_scale=color_scale, 
    zoom=8, 
    height=1000, 
    width=1000
)

# Update the map layout to use open-street-map style
fig.update_layout(mapbox_style="open-street-map")

# Remove the margins
fig.update_layout(margin={"r":0, "t":0, "l":0, "b":0})
In [54]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create a figure with specified size
plt.figure(figsize=(12, 8))

# Use sns.countplot with the DataFrame and column name
sns.countplot(x='Delivery_city', data=df)

# Rotate the x-ticks for better readability
plt.xticks(rotation='vertical')

# Show the plot
plt.show()
No description has been provided for this image
In [55]:
plt.figure(figsize=(12, 8))

# Use sns.countplot with the DataFrame and column name specified via the x parameter
sns.countplot(x='Weather_conditions', data=df)

# Rotate the x-ticks for better readability
plt.xticks(rotation='vertical')

# Show the plot
plt.show()
No description has been provided for this image
In [56]:
plt.figure(figsize=(12,8))
sns.countplot(x='Road_traffic_density', data=df)
plt.xticks(rotation='vertical')
plt.show()
No description has been provided for this image
In [57]:
plt.figure(figsize=(12,8))
sns.countplot(x='day', data=df)
plt.xticks(rotation='vertical')
plt.show()
No description has been provided for this image
In [58]:
plt.figure(figsize=(12,8))
sns.countplot(x='Vehicle_condition', data=df)
plt.xticks(rotation='vertical')
plt.show()
No description has been provided for this image
In [59]:
numerical_feature=[feature for feature in df.columns if df[feature].dtypes != 'O']
numerical_feature
df.drop('Delivery_person_ID',axis=1,inplace=True)
df.drop(['Time_Orderd','Time_Order_picked'],axis=1,inplace=True)

Splitting in X and Y¶

In [60]:
X = df.drop(labels=['Time_taken (min)'],axis=1)
y = df[['Time_taken (min)']]
In [61]:
num_col=[feature for feature in df.columns if df[feature].dtypes != 'O']
num_col
Out[61]:
['Delivery_person_Age',
 'Delivery_person_Ratings',
 'Restaurant_latitude',
 'Restaurant_longitude',
 'Delivery_location_latitude',
 'Delivery_location_longitude',
 'Vehicle_condition',
 'multiple_deliveries',
 'Time_taken (min)',
 'year',
 'month',
 'day',
 'TimeOrder_Hour',
 'TimeOrder_min',
 'Time_Order_picked_Hour',
 'Time_Order_picked_min',
 'distance']
In [62]:
cat_col=[feature for feature in df.columns if df[feature].dtypes == 'O']
cat_col
Out[62]:
['Weather_conditions',
 'Road_traffic_density',
 'Type_of_order',
 'Type_of_vehicle',
 'Festival',
 'City',
 'Delivery_city']
In [63]:
df[cat_col].head()
Out[63]:
Weather_conditions Road_traffic_density Type_of_order Type_of_vehicle Festival City Delivery_city
0 Fog Jam Snack motorcycle No Metropolitian DEH
1 Stormy High Meal motorcycle No Metropolitian KOC
2 Sandstorms Medium Drinks scooter No Metropolitian PUNE
3 Sandstorms Low Buffet motorcycle No Metropolitian LUDH
4 Fog Jam Snack scooter No Metropolitian KNP
In [64]:
df[df['Road_traffic_density']=='Low']
Out[64]:
Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Weather_conditions Road_traffic_density Vehicle_condition Type_of_order ... Time_taken (min) year month day TimeOrder_Hour TimeOrder_min Time_Order_picked_Hour Time_Order_picked_min Delivery_city distance
3 34.0 4.3 30.899584 75.809346 30.919584 75.829346 Sandstorms Low 0 Buffet ... 20 2022 2 13 9 20 9 30 LUDH 2.930258
11 31.0 4.7 23.357804 85.325146 23.487804 85.455146 Sandstorms Low 2 Meal ... 26 2022 3 10 22 30 22 45 RANCHI 19.618739
12 37.0 5.0 11.003669 76.976494 11.013669 76.986494 Sandstorms Low 1 Snack ... 20 2022 3 11 8 15 8 30 COIMB 1.558132
17 35.0 4.3 21.160522 72.771477 21.250522 72.861477 Cloudy Low 1 Drinks ... 38 2022 3 12 23 20 23 30 SUR 13.682045
19 23.0 4.8 18.514210 73.838429 18.624210 73.948429 Windy Low 2 Buffet ... 12 2022 4 2 23 35 23 45 PUNE 16.853619
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45569 30.0 4.9 11.025083 77.015393 11.045083 77.035393 Windy Low 1 Snack ... 26 2022 3 11 10 25 10 40 COIMB 3.116127
45571 34.0 4.9 12.297954 76.665169 12.317954 76.685169 Sandstorms Low 1 Snack ... 26 2022 4 5 9 10 9 15 MYS 3.109134
45573 30.0 4.6 18.562450 73.916619 18.582450 73.936619 Cloudy Low 0 Meal ... 20 2022 4 1 0 37 9 15 PUNE 3.064267
45575 37.0 4.8 13.022394 80.242439 13.042394 80.262439 Sandstorms Low 2 Drinks ... 20 2022 4 5 9 35 9 50 CHEN 3.104827
45581 30.0 4.9 13.022394 80.242439 13.052394 80.272439 Cloudy Low 1 Drinks ... 16 2022 3 11 23 50 24 5 CHEN 4.657195

13815 rows × 24 columns

In [65]:
Road_traffic_density=['Low','Medium','High','Jam']
Weather_conditions=['Sunny','Cloudy','Windy','Fog','Sandstorms','Stormy']
In [66]:
categorical_column=['Type_of_order','Type_of_vehicle','Festival','City','Delivery_city']
ordinal_encod=['Road_traffic_density','Weather_conditions']
numerical_column=['Delivery_person_Age','Delivery_person_Ratings','Vehicle_condition','multiple_deliveries',
                  'TimeOrder_Hour','distance']
In [67]:
# numerical pipeline

numerical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='constant',fill_value=0)),
    ('scaler',StandardScaler(with_mean=False))
])

# categorical pipeline

categorical_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore')),
    ('scaler',StandardScaler(with_mean=False))
])


# ordinal pipeline

ordianl_pipeline=Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ordinal',OrdinalEncoder(categories=[Road_traffic_density,Weather_conditions])),
    ('scaler',StandardScaler(with_mean=False))   
])
In [68]:
preprocessor =ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,numerical_column),
    ('categorical_pipeline',categorical_pipeline,categorical_column),
    ('ordianl_pipeline',ordianl_pipeline,ordinal_encod)
])
In [69]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)
In [70]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)
In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso,Ridge,ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
In [72]:
def evaluate_reg(true, predicted):
    r2 = r2_score(true, predicted) # Calculate r2 score
    MAE = mean_absolute_error(true, predicted) # Calculate MAE
    MSE = mean_squared_error(true, predicted) # Calculate MSE
    rmse = np.sqrt(mean_squared_error(true, predicted))
   
    return r2, MAE , MSE,rmse
In [73]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

models = {
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "linear Regression": LinearRegression(),
    "Ridge":Ridge(),
    "XGBRegressor": XGBRegressor(), 
  
}

r2_list=[]
mse_list=[]
models_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model
   
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)



    # Test set performance
    r2,MAE,MSE,rmse=evaluate_reg(y_test, y_test_pred)
    r2_train,MAE_train,MSE_train,rmse_train=evaluate_reg(y_train, y_train_pred)
    

    print(list(models.keys())[i])
    models_list.append(list(models.keys())[i])
    

    
    print('Model performance for Test set')
    print('- r2_score: {:.4f}'.format(r2))
    print('- MAE: {:.4f}'.format(MAE))
    print('- MSE: {:.4f}'.format(MSE))
    print('- rmse: {:.4f}'.format(rmse))
    r2_list.append(r2)
    mse_list.append(MSE)
    
    print('-'*35)
        
    print('Model performance for train set')
    print('- r2_score: {:.4f}'.format(r2_train))
    print('- MAE: {:.4f}'.format(MAE_train))
    print('- MSE: {:.4f}'.format(MSE_train))
    print('- rmse: {:.4f}'.format(rmse_train))

    print('='*35)
    print('='*35)
    print('\n')
Random Forest
Model performance for Test set
- r2_score: 0.8283
- MAE: 3.1358
- MSE: 15.3218
- rmse: 3.9143
-----------------------------------
Model performance for train set
- r2_score: 0.9759
- MAE: 1.1483
- MSE: 2.1068
- rmse: 1.4515
===================================
===================================


Decision Tree
Model performance for Test set
- r2_score: 0.6935
- MAE: 3.9932
- MSE: 27.3604
- rmse: 5.2307
-----------------------------------
Model performance for train set
- r2_score: 1.0000
- MAE: 0.0000
- MSE: 0.0000
- rmse: 0.0000
===================================
===================================


Gradient Boosting
Model performance for Test set
- r2_score: 0.7697
- MAE: 3.6335
- MSE: 20.5584
- rmse: 4.5341
-----------------------------------
Model performance for train set
- r2_score: 0.7740
- MAE: 3.5680
- MSE: 19.7730
- rmse: 4.4467
===================================
===================================


linear Regression
Model performance for Test set
- r2_score: 0.5351
- MAE: 5.1319
- MSE: 41.4926
- rmse: 6.4415
-----------------------------------
Model performance for train set
- r2_score: 0.5396
- MAE: 5.0371
- MSE: 40.2769
- rmse: 6.3464
===================================
===================================


Ridge
Model performance for Test set
- r2_score: 0.5352
- MAE: 5.1301
- MSE: 41.4830
- rmse: 6.4407
-----------------------------------
Model performance for train set
- r2_score: 0.5396
- MAE: 5.0352
- MSE: 40.2714
- rmse: 6.3460
===================================
===================================


XGBRegressor
Model performance for Test set
- r2_score: 0.8288
- MAE: 3.1347
- MSE: 15.2790
- rmse: 3.9088
-----------------------------------
Model performance for train set
- r2_score: 0.8752
- MAE: 2.6442
- MSE: 10.9201
- rmse: 3.3046
===================================
===================================


In [74]:
Results = pd.DataFrame(list(zip(models_list, r2_list)), columns=['Model Name', 'r2_score'])
Results.sort_values(by=["r2_score"],ascending=False)
Out[74]:
Model Name r2_score
5 XGBRegressor 0.828819
0 Random Forest 0.828340
2 Gradient Boosting 0.769671
1 Decision Tree 0.693463
4 Ridge 0.535238
3 linear Regression 0.535130
In [76]:
Results = pd.DataFrame(list(zip(models_list, mse_list)), columns=['Model Name', 'Mse'])
Results.sort_values(by=["Mse"],ascending=True)
Out[76]:
Model Name Mse
5 XGBRegressor 15.278972
0 Random Forest 15.321797
2 Gradient Boosting 20.558401
1 Decision Tree 27.360392
4 Ridge 41.482991
3 linear Regression 41.492630